# DATA
chrD=read.table(file ="filtered_recombination_data_withBin")
names(chrD)=c("R_ID", "Chrm","PhysD","Pos3.0+1","Index",
"BTAU","BTAU+1","cM","ID1","Database",
"ID2","Type","Informative_Meiosis",
"Kosambi_distance","BinID")
head(chrD)
## R_ID Chrm PhysD Pos3.0+1 Index BTAU BTAU+1 cM ID1
## 1 1 1 822710 822711 1 620748 620749 3.040 rs29017090
## 2 2 1 822930 822931 2 620968 620969 2.845 rs29017089
## 3 3 1 2049815 2049816 3 1857578 1857579 2.531 rs29015832
## 4 4 1 2049855 2049856 4 1857618 1857619 2.843 rs29015833
## 5 5 1 2305629 2305630 5 2114048 2114049 14.092 rs29026024
## 6 6 1 2994006 2994007 6 2792672 2792673 0.985 rs29027420
## Database ID2 Type Informative_Meiosis Kosambi_distance BinID
## 1 NCBI rs29017090 SNP 346 5.703 1
## 2 NCBI rs29017089 SNP 359 0.195 1
## 3 NCBI rs29015832 SNP 173 0.312 2
## 4 NCBI rs29015833 SNP 316 0.002 2
## 5 NCBI rs29026024 SNP 37 0.999 3
## 6 NCBI rs29027420 SNP 233 1.082 3
##############
We are just examining closer the chromosome 1 as an example but this trend is quite repeatable among all chromosomes
We use a cubic fit using least squares for fitting (standard polynomial regression). Note that we use the following convention when regression genetic distance (in cM) against physical distance D (in Mb) \[ cM = a + b D + C D^2 + d D^3 \].
CubReg=lm(cM~PhysD + I(PhysD^2) + I(PhysD^3),
data=chrD_selectionChrm)
summary(CubReg)
##
## Call:
## lm(formula = cM ~ PhysD + I(PhysD^2) + I(PhysD^3), data = chrD_selectionChrm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6379 -1.3776 -0.2759 2.0150 7.0026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.829e+00 4.744e-01 8.072 1.06e-14 ***
## PhysD 1.432e+00 2.723e-02 52.609 < 2e-16 ***
## I(PhysD^2) -8.059e-03 4.101e-04 -19.650 < 2e-16 ***
## I(PhysD^3) 3.318e-05 1.727e-06 19.211 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.579 on 358 degrees of freedom
## Multiple R-squared: 0.9962, Adjusted R-squared: 0.9961
## F-statistic: 3.103e+04 on 3 and 358 DF, p-value: < 2.2e-16
# Terms of the of the cubic equation:
CubReg$coefficients
## (Intercept) PhysD I(PhysD^2) I(PhysD^3)
## 3.829346e+00 1.432352e+00 -8.059209e-03 3.317595e-05
a = CubReg$coefficients[1]
b = CubReg$coefficients[2]
c = CubReg$coefficients[3]
d = CubReg$coefficients[4]
plot(chrD_selectionChrm$PhysD,chrD_selectionChrm$cM, pch=20, cex=0.25, col="grey30",
xlab="Cumulated Physical distance (Mb)", ylab="CUMULATED cM", main=paste("Obs versus fitted trend on chr ", chrm))
lines(chrD_selectionChrm$PhysD,CubReg$fitted.values, type = "l", col="cornflowerblue", lwd=2)
listChr=unique(chrD$Chrm)
Toscale=10^6 # I want to have Mb as physical distance units
for (chrm in 1:29){
chrD_selectionChrm = chrD[chrD[,2]==chrm,]
chrD_selectionChrm$PhysD=chrD_selectionChrm$PhysD/Toscale
CubReg=lm(cM~PhysD + I(PhysD^2) + I(PhysD^3),
data=chrD_selectionChrm)
plot(chrD_selectionChrm$PhysD,chrD_selectionChrm$cM, pch=20, cex=0.25,
col="grey30",
xlab="Cumulated Physical distance (Mb)", ylab="CUMULATED cM",
main=paste("Obs versus fitted trend on chr ", chrm))
lines(chrD_selectionChrm$PhysD,CubReg$fitted.values, type = "l",
col="cornflowerblue", lwd=2)
}
## Examine and parse the estimated local recombination rates from Belen's script.
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
df=read_csv("local_recomb_rate_per_bin_NEW.txt",col_names = c("chr","bin","begin","end","mid", "cMperMb"),progress = T)
## Parsed with column specification:
## cols(
## chr = col_integer(),
## bin = col_integer(),
## begin = col_double(),
## end = col_double(),
## mid = col_double(),
## cMperMb = col_double()
## )
names(df)
## [1] "chr" "bin" "begin" "end" "mid" "cMperMb"
glimpse(df)
## Observations: 1,936
## Variables: 6
## $ chr <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ bin <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ begin <dbl> 822710, 2049815, 2305629, 3553857, 4727118, 5647166, 6...
## $ end <dbl> 822930, 2049855, 3249057, 3601437, 4784221, 5741816, 7...
## $ mid <dbl> 822820, 2049835, 2777343, 3577647, 4755670, 5694491, 6...
## $ cMperMb <dbl> 1.4191572, 1.3997305, 1.3883537, 1.3759603, 1.3579495,...
head(df)
## # A tibble: 6 x 6
## chr bin begin end mid cMperMb
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 822710 822930 822820 1.42
## 2 1 2 2049815 2049855 2049835 1.40
## 3 1 3 2305629 3249057 2777343 1.39
## 4 1 4 3553857 3601437 3577647 1.38
## 5 1 5 4727118 4784221 4755670. 1.36
## 6 1 6 5647166 5741816 5694491 1.34
dim(df)
## [1] 1936 6
summary(df)
## chr bin begin end
## Min. : 1.00 Min. : 1.00 Min. : 169363 Min. : -Inf
## 1st Qu.: 5.00 1st Qu.: 17.00 1st Qu.:25052622 1st Qu.: 12032212
## Median :11.00 Median : 34.00 Median :50154374 Median : 37050024
## Mean :12.28 Mean : 37.74 Mean : Inf Mean : -Inf
## 3rd Qu.:19.00 3rd Qu.: 54.00 3rd Qu.:89438273 3rd Qu.: 64798742
## Max. :29.00 Max. :126.00 Max. : Inf Max. :157837373
##
## mid cMperMb
## Min. : 402021 Min. :0.5574
## 1st Qu.: 21904767 1st Qu.:0.9301
## Median : 43339789 Median :1.1834
## Mean : 48517362 Mean :1.2855
## 3rd Qu.: 69209223 3rd Qu.:1.4993
## Max. :157707058 Max. :4.6415
## NA's :259 NA's :259
df %>% filter(!is.na(cMperMb)) -> df # excluding bins with no LRR
dim(df)
## [1] 1677 6
ggplot(data = df) + geom_histogram(aes(x=cMperMb),binwidth = 0.1)
# per chromosome
df %>% group_by(chr) %>%
summarise(mean=mean(cMperMb),
median=median(cMperMb),
Nbins=n()) ->cMperMBChrom
cMperMBChrom
## # A tibble: 29 x 4
## chr mean median Nbins
## <int> <dbl> <dbl> <int>
## 1 1 0.991 0.924 103
## 2 2 1.16 1.13 85
## 3 3 1.13 1.04 78
## 4 4 1.12 1.04 77
## 5 5 1.09 1.02 72
## 6 6 1.15 1.11 79
## 7 7 1.21 1.11 73
## 8 8 1.13 1.04 75
## 9 9 1.10 1.03 64
## 10 10 1.14 1.05 67
## # ... with 19 more rows
ggplot(data = df) +
geom_histogram(aes(x=cMperMb, fill=chr)) +
facet_wrap(~chr) +
NULL
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.